In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2, venn3
import os, re, json
# NOTE(review): the star import hides where names come from. DATE, project_palette,
# make_sankey_plot_with_counts and autosave are used below but never defined in the
# visible notebook, so they presumably come from utility_functions — confirm and
# consider importing them explicitly.
from utility_functions import *
# Display the date stamp that is embedded in every exported file name below.
DATE
Out[1]:
'20250627'
In [2]:
# --- Run configuration ---------------------------------------------------
# Root folder; each dataset lives in its own sub-directory underneath it.
# NOTE(review): absolute, machine-specific path — anyone else running this
# notebook has to edit it.
working_folder = "C:/Users/Enrico/OneDrive - UGent/run-ionbot"

# Sub-directory names to load, built as "<accession>.v0.11.4".
PXDs = [
    f"{accession}.v0.11.4"
    for accession in ("PXD002057", "PXD005833", "PXD014258")
]

# Filtering variant; it selects the input CSV name and tags every output file.
# Other values previously used in this cell: 'global', 'custom'.
filtering = 'hybrid'
In [3]:
# Load one outer-join table per dataset and stack them into a single frame.
# The per-dataset frames are kept in their own list so that `data` only ever
# refers to the concatenated DataFrame (the original reused `data` for both).
frames = []
for dataset_name in PXDs:
    csv_path = os.path.join(
        working_folder,
        dataset_name,
        f"openprot-x-trembl-filt-{filtering}-outerjoin.csv",
    )
    frames.append(pd.read_csv(csv_path))

# Per-dataset shapes. A named loop variable is used instead of `_`, because
# assigning `_` in IPython stops the automatic "last output" variable from updating.
for frame in frames:
    print(frame.shape)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
data = pd.concat(frames, ignore_index=True)
print(data.shape)
data.tail()
(47930, 66)
(163741, 66)
(131280, 66)
(342951, 66)
Out[3]:
spectrum_title scan spectrum_file precursor_mass_trembl database_peptide_trembl matched_peptide_trembl modifications_trembl database_trembl psm_score_trembl global_q_trembl ... all-explained_open by-intensity-pattern-correlation_open top_tag_rank_nterm_open top_tag_rank_cterm_open top_tag_rank_open predicted_retention_time_open retention_time_error_adjusted_open Same_peptide Same_mod_peptide Same_mods_noRT
342946 ESC-HF-SampleHela5:controllerType=0 controller... 9994 ESC-HF-SampleHela5 1205.651464 HLSVNDLPVGR HLSVNDLPVGR Unmodified T 2.325810 0.000057 ... 0.2528 0.7855 0.0 1.0 0.0 1023.251153 100.694167 True True True
342947 ESC-HF-SampleHela5:controllerType=0 controller... 9996 ESC-HF-SampleHela5 1400.780981 TFIAIKPDGVQR TFIAIKPDGVQR 6|[1263]Gly[K](144.07)_or_6|[4]Carbamidomethyl... T 0.764866 0.001069 ... 0.2335 0.7031 0.0 17.0 0.0 1274.538023 150.482963 True False False
342948 ESC-HF-SampleHela5:controllerType=0 controller... 9997 ESC-HF-SampleHela5 1324.626854 KFEEIPIAHIK KFEEIPIAHIK Unmodified T 1.441000 0.000057 ... 0.1061 0.8813 3.0 4.0 3.0 1105.713169 14.384891 True True True
342949 ESC-HF-SampleHela5:controllerType=0 controller... 9998 ESC-HF-SampleHela5 1436.764379 GVTFNVTTVDTKR GVTFNVTTVDTKR Unmodified T 1.822180 0.000057 ... 0.2252 0.6929 0.0 0.0 0.0 1159.870875 35.633775 True True True
342950 ESC-HF-SampleHela5:controllerType=0 controller... 9999 ESC-HF-SampleHela5 1334.689550 GNEIEPNFSATR GNEIEPNFSATR Unmodified T 1.931300 0.000057 ... 0.1597 0.7231 0.0 2.0 0.0 1285.778264 161.457404 True True True

5 rows × 66 columns

General plot¶

In [4]:
# searches overall overlap
# Spectrum titles that have a non-null `database_*` value in each search
# (A: TrEMBL, B: OpenProt); these are what the plot counts as identified.
A = data.loc[data.database_trembl.notna(), 'spectrum_title']
B = data.loc[data.database_open.notna(), 'spectrum_title']

# Two-set Venn diagram of the distinct spectrum titles, coloured with the
# project palette.
venn2(
    [set(A), set(B)],
    set_labels=['TrEMBL', 'OpenProt'],
    set_colors=[project_palette['trembl'], project_palette['openprot']],
)
plt.title('Identified spectra overlap (all datasets)')
# The output file name carries the date stamp and the filtering variant.
plt.savefig(f"publication-data/{DATE}-overall-overlap-trembl-openprot-{filtering}-filtering.svg")
No description has been provided for this image
In [5]:
# Ratio of distinct spectrum titles in B (OpenProt search) to distinct
# spectrum titles in A (TrEMBL search).
n_open_spectra = len(set(B))
n_trembl_spectra = len(set(A))
n_open_spectra / n_trembl_spectra
Out[5]:
0.9405282625498633
In [6]:
# Build the Sankey figure comparing the `_trembl` and `_open` column sets of `data`.
# The helper also returns `counts`, which the next cell indexes with .loc as a
# category-by-category table (rows: sankey_trembl, columns: sankey_open, per the output).
F, counts = make_sankey_plot_with_counts(data, suffixes=['_trembl','_open'])
# Export as SVG; the file name carries the date stamp and the filtering variant.
# NOTE(review): `write_image`/`show` suggest a plotly figure, which would need a
# static-export backend (e.g. kaleido) installed — confirm.
F.write_image(f"publication-data/{DATE}-Sankey-trembl-openprot-{filtering}-filtering.svg")
F.show()
In [7]:
# Categories to keep from the TrEMBL search (rows), in display order.
trembl_categories = [
    'Canonical+Unmodified/Expected',
    'Canonical+Unexpected',
    'Decoy',
    'Unidentified',
]
# Categories to keep from the OpenProt search (columns), in display order.
open_categories = [
    'Canonical+Unmodified/Expected',
    'Canonical+Unexpected',
    'NonCanonical+Unmodified/Expected',
    'NonCanonical+Unexpected',
    'Decoy',
    'Unidentified',
]

# Sub-table of the Sankey counts restricted and ordered as listed above;
# the cells below slice this table.
data3 = counts.loc[trembl_categories, open_categories]
data3.style.background_gradient()
Out[7]:
sankey_open Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy Unidentified
sankey_trembl            
Canonical+Unmodified/Expected 226987 1207 677 728 259 15073
Canonical+Unexpected 1281 78567 476 866 73 7862
Decoy 199 73 21 20 879 2425
Unidentified 1796 1165 679 865 773 0
In [8]:
# All spectra: the complete table, i.e. the 100% reference for the cells below.
tmp = data3.iloc[:, :]
n_selected = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_selected)
print(f"{n_selected / n_total:.1%}")
tmp
342951
100.0%
Out[8]:
sankey_open Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy Unidentified
sankey_trembl
Canonical+Unmodified/Expected 226987 1207 677 728 259 15073
Canonical+Unexpected 1281 78567 476 866 73 7862
Decoy 199 73 21 20 879 2425
Unidentified 1796 1165 679 865 773 0
In [9]:
# Canonical --> Canonical
# Select by label rather than by position, so the selection matches the
# categories named here and raises a KeyError if the table layout changes.
canonical = ['Canonical+Unmodified/Expected', 'Canonical+Unexpected']
tmp = data3.loc[canonical, canonical]
n_selected = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_selected)
# Share of all spectra in data3.
print(f"{n_selected / n_total:.1%}")
tmp
308042
89.8%
Out[9]:
sankey_open Canonical+Unmodified/Expected Canonical+Unexpected
sankey_trembl
Canonical+Unmodified/Expected 226987 1207
Canonical+Unexpected 1281 78567
In [10]:
# Canonical --> Unidentified
# Canonical in the TrEMBL search (rows), unidentified in the OpenProt search (column).
# Label-based selection keeps the code in sync with the comment; the column is
# passed as a list so the result stays a DataFrame.
canonical = ['Canonical+Unmodified/Expected', 'Canonical+Unexpected']
tmp = data3.loc[canonical, ['Unidentified']]
n_selected = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_selected)
# Share of all spectra in data3.
print(f"{n_selected / n_total:.1%}")
tmp
22935
6.7%
Out[10]:
sankey_open Unidentified
sankey_trembl
Canonical+Unmodified/Expected 15073
Canonical+Unexpected 7862
In [11]:
# Canonical --> NonCanonical
# Canonical in the TrEMBL search (rows), non-canonical in the OpenProt search (columns).
# Selected by label so a change in data3's layout cannot silently shift the slice.
canonical = ['Canonical+Unmodified/Expected', 'Canonical+Unexpected']
noncanonical = ['NonCanonical+Unmodified/Expected', 'NonCanonical+Unexpected']
tmp = data3.loc[canonical, noncanonical]
n_selected = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_selected)
# Share of all spectra in data3.
print(f"{n_selected / n_total:.1%}")
tmp
2747
0.8%
Out[11]:
sankey_open NonCanonical+Unmodified/Expected NonCanonical+Unexpected
sankey_trembl
Canonical+Unmodified/Expected 677 728
Canonical+Unexpected 476 866
In [12]:
# Any Peptide --> Any Peptide
# Everything except the 'Unidentified' row and column, dropped by name instead
# of relying on them being last.
# NOTE(review): this keeps the 'Decoy' row and column, so decoy matches count
# as "any peptide" here — confirm that is intended.
tmp = data3.drop(index='Unidentified', columns='Unidentified')
n_selected = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_selected)
# Share of all spectra in data3.
print(f"{n_selected / n_total:.1%}")
tmp
312313
91.1%
Out[12]:
sankey_open Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy
sankey_trembl
Canonical+Unmodified/Expected 226987 1207 677 728 259
Canonical+Unexpected 1281 78567 476 866 73
Decoy 199 73 21 20 879
In [13]:
# Unidentified --> Canonical
# Unidentified in the TrEMBL search (row), canonical in the OpenProt search (columns).
# The row is addressed by label rather than by the magic position 3; it is
# passed as a list so the result stays a DataFrame.
canonical = ['Canonical+Unmodified/Expected', 'Canonical+Unexpected']
tmp = data3.loc[['Unidentified'], canonical]
n_selected = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_selected)
# Share of all spectra in data3.
print(f"{n_selected / n_total:.1%}")
tmp
2961
0.9%
Out[13]:
sankey_open Canonical+Unmodified/Expected Canonical+Unexpected
sankey_trembl
Unidentified 1796 1165
In [14]:
# Any --> NonCanonical
# Every TrEMBL-side row of data3 (including 'Decoy' and 'Unidentified') against
# the two non-canonical OpenProt columns, selected by label.
noncanonical = ['NonCanonical+Unmodified/Expected', 'NonCanonical+Unexpected']
tmp = data3.loc[:, noncanonical]
n_selected = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_selected)
# Share of all spectra in data3.
print(f"{n_selected / n_total:.1%}")
tmp
4332
1.3%
Out[14]:
sankey_open NonCanonical+Unmodified/Expected NonCanonical+Unexpected
sankey_trembl
Canonical+Unmodified/Expected 677 728
Canonical+Unexpected 476 866
Decoy 21 20
Unidentified 679 865

Zoom on non-canonical identifications¶

In [15]:
# Restrict to rows whose OpenProt match is non-canonical and has database_open == 'T'.
# NOTE(review): 'T' presumably marks target (non-decoy) hits — confirm.
noncanonical_open = (data.isCanonical_open == 'NonCanonical') & (data.database_open == 'T')

# Same Sankey as above, on the subset only. The counts table is not needed here;
# it is bound to a named throwaway instead of `_`, because assigning `_` in
# IPython stops the automatic "last output" variable from updating.
F, _zoom_counts = make_sankey_plot_with_counts(
    data[noncanonical_open],
    suffixes=['_trembl', '_open'],
)
# Export as SVG; the file name carries the date stamp and the filtering variant.
F.write_image(f"publication-data/{DATE}-Zoomed-Sankey-trembl-openprot-{filtering}-filtering.svg")
F.show()

In [ ]:
# Call the project's autosave helper, passing the filtering variant as an extra label.
# NOTE(review): autosave is not defined in the visible notebook (presumably from
# utility_functions); what it saves and where cannot be told from here — confirm.
autosave(extra_labels='-'+filtering)
# Echo the active filtering variant as the cell output.
filtering